/*
 * Copyright (c) 2018-2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifdef __aarch64__

#include <algorithm>

#include "arm_gemm.hpp"

#include "../../asmlib.hpp"
#include "../../utils.hpp"

namespace arm_gemm {

void a64_hybrid_fp32_mla_16x4_a55(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool append) {
    const int K_stride = K;
    const long loops_count = ((K + 4) / 8) - 1;
    K -= loops_count * 8;
    const long regs_count = (K / 4) - 1;
    K -= (regs_count + 1) * 4;
    const long blocks_count = K / 1;
    float nullbias[16];
    if (!append && !bias) {
        memset(nullbias, 0, (16 * sizeof(float)));
    }
    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
    float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
    const float * const minptr = &minval;
    const float * const maxptr = &maxval;

    switch(act.type)
    {
        default:
        case Activation::Type::None:
            break;
        case Activation::Type::BoundedReLU:
            maxval = static_cast<float>(act.param1);
            /* fall through */
        case Activation::Type::ReLU:
            minval = 0.0f;
            break;
    }

    for (int y=0; y<M; y+=4) {
        const float * const a_ptr0_base = A + (y * lda);
        const unsigned long ldab = lda * sizeof(float);

        float *c_ptr0 = C + (y * ldc);

        for (int x0=0; x0<N; x0+=16ul) {
            const long width = std::min((unsigned long)N-x0, 16ul);
            long loops = loops_count;
            long regs = regs_count;
            long blocks = blocks_count;
            const float *a_ptr0 = a_ptr0_base;
            const float *b_ptr0 = B + (K_stride * x0);
            const bool use_result_buffer = (width < 16);
            float result_buffer[64];
            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
            float *c_ptr_real = c_ptr0;
            if (use_result_buffer && append) {
                for(int cy=0; cy<std::min(M-y, 4); cy++) {
                    for(unsigned int cx=0; cx<width; cx++) {
                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
                    }
                }
            }
            if (use_result_buffer) {
                c_ptr0 = result_buffer;
            }
            const float *biasptr = bias ? bias+x0 : nullbias;

            switch(M-y) {
                case 1:
                    __asm __volatile (
                        "temploadreg0 .req X0\n"
                        "temploadreg1 .req X1\n"
                        "temploadreg2 .req X2\n"
                        "temploadreg3 .req X3\n"
                        "cbnz %[append], 1f\n"
                        "ldr q16, [%[biasptr]]\n"
                        "ldr q17, [%[biasptr], #0x10]\n"
                        "ldr q18, [%[biasptr], #0x20]\n"
                        "ldr q19, [%[biasptr], #0x30]\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "ldr q12, [%[b_ptr0], #0x40]\n"
                        "ldr q13, [%[b_ptr0], #0x50]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "cbz %[loops], 2f\n"
                        "b 3f\n"
                        "1:\n"
                        "ldr q16, [%[c_ptr0]]\n"
                        "ldr q17, [%[c_ptr0], #0x10]\n"
                        "ldr q18, [%[c_ptr0], #0x20]\n"
                        "ldr q19, [%[c_ptr0], #0x30]\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "ldr q12, [%[b_ptr0], #0x40]\n"
                        "ldr q13, [%[b_ptr0], #0x50]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "cbz %[loops], 2f\n"
                        "3:\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "ldr d15, [%[b_ptr0], #-0x10]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "ldr d4, [%[a_ptr0]]\n"
                        "fmla v16.4s, v12.4s, v0.s[1]\n"
                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
                        "fmla v17.4s, v13.4s, v0.s[1]\n"
                        "ldr d8, [%[b_ptr0]]\n"
                        "fmla v18.4s, v14.4s, v0.s[1]\n"
                        "ldr d9, [%[b_ptr0], #0x10]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                        "subs %[loops], %[loops], #0x1\n"
                        "ins v4.d[1], temploadreg0\n"
                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                        "ldr d10, [%[b_ptr0], #0x20]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
                        "ldr d11, [%[b_ptr0], #0x30]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
                        "ldr d12, [%[b_ptr0], #0x40]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v19.4s, v15.4s, v0.s[1]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
                        "ldr d13, [%[b_ptr0], #0x50]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "ldr d15, [%[b_ptr0], #0x70]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "ldr d8, [%[b_ptr0], #-0x80]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
                        "fmla v16.4s, v12.4s, v0.s[3]\n"
                        "ldr d9, [%[b_ptr0], #-0x70]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
                        "ldr d10, [%[b_ptr0], #-0x60]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v17.4s, v13.4s, v0.s[3]\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
                        "ldr d11, [%[b_ptr0], #-0x50]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v18.4s, v14.4s, v0.s[3]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
                        "ldr d12, [%[b_ptr0], #-0x40]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v19.4s, v15.4s, v0.s[3]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
                        "ldr d13, [%[b_ptr0], #-0x30]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v16.4s, v8.4s, v4.s[0]\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
                        "ldr d14, [%[b_ptr0], #-0x20]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v17.4s, v9.4s, v4.s[0]\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
                        "ldr d15, [%[b_ptr0], #-0x10]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v18.4s, v10.4s, v4.s[0]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
                        "ldr d0, [%[a_ptr0], #-0x10]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "fmla v19.4s, v11.4s, v4.s[0]\n"
                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
                        "ldr d8, [%[b_ptr0]]\n"
                        "ldr d9, [%[b_ptr0], #0x10]\n"
                        "fmla v16.4s, v12.4s, v4.s[1]\n"
                        "ins v0.d[1], temploadreg0\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                        "ldr d10, [%[b_ptr0], #0x20]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v17.4s, v13.4s, v4.s[1]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
                        "ldr d11, [%[b_ptr0], #0x30]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v18.4s, v14.4s, v4.s[1]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
                        "ldr d12, [%[b_ptr0], #0x40]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v19.4s, v15.4s, v4.s[1]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
                        "ldr d13, [%[b_ptr0], #0x50]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v16.4s, v8.4s, v4.s[2]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v17.4s, v9.4s, v4.s[2]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "ldr d15, [%[b_ptr0], #0x70]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v18.4s, v10.4s, v4.s[2]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v19.4s, v11.4s, v4.s[2]\n"
                        "ldr d8, [%[b_ptr0], #-0x80]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
                        "fmla v16.4s, v12.4s, v4.s[3]\n"
                        "ldr d9, [%[b_ptr0], #-0x70]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
                        "ldr d10, [%[b_ptr0], #-0x60]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v17.4s, v13.4s, v4.s[3]\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
                        "ldr d11, [%[b_ptr0], #-0x50]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v18.4s, v14.4s, v4.s[3]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
                        "ldr d12, [%[b_ptr0], #-0x40]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v19.4s, v15.4s, v4.s[3]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
                        "ldr d13, [%[b_ptr0], #-0x30]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
                        "ldr d14, [%[b_ptr0], #-0x20]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "ins v12.d[1], temploadreg0\n"
                        "ins v13.d[1], temploadreg1\n"
                        "b.ne 3b\n"
                        "2:\n"
                        "ins v14.d[1], temploadreg2\n"
                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
                        "ldr d15, [%[b_ptr0], #-0x10]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "cbz %[regs], 4f\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr d4, [%[a_ptr0]]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "ldr d8, [%[b_ptr0]]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "ldr d9, [%[b_ptr0], #0x10]\n"
                        "fmla v16.4s, v12.4s, v0.s[1]\n"
                        "ins v4.d[1], temploadreg0\n"
                        "fmla v17.4s, v13.4s, v0.s[1]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                        "fmla v18.4s, v14.4s, v0.s[1]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                        "fmla v19.4s, v15.4s, v0.s[1]\n"
                        "ldr d10, [%[b_ptr0], #0x20]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "ldr d11, [%[b_ptr0], #0x30]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
                        "ldr d12, [%[b_ptr0], #0x40]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
                        "ldr d13, [%[b_ptr0], #0x50]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "ldr d15, [%[b_ptr0], #0x70]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "ldr d8, [%[b_ptr0], #-0x80]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
                        "fmla v16.4s, v12.4s, v0.s[3]\n"
                        "ldr d9, [%[b_ptr0], #-0x70]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
                        "ldr d10, [%[b_ptr0], #-0x60]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v17.4s, v13.4s, v0.s[3]\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
                        "ldr d11, [%[b_ptr0], #-0x50]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v18.4s, v14.4s, v0.s[3]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
                        "ldr d12, [%[b_ptr0], #-0x40]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v19.4s, v15.4s, v0.s[3]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
                        "ldr d13, [%[b_ptr0], #-0x30]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v16.4s, v8.4s, v4.s[0]\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
                        "ldr d14, [%[b_ptr0], #-0x20]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v17.4s, v9.4s, v4.s[0]\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
                        "ldr d15, [%[b_ptr0], #-0x10]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v18.4s, v10.4s, v4.s[0]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
                        "ldr d8, [%[b_ptr0]]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "fmla v19.4s, v11.4s, v4.s[0]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                        "ldr d9, [%[b_ptr0], #0x10]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "fmla v16.4s, v12.4s, v4.s[1]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                        "ldr d10, [%[b_ptr0], #0x20]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v17.4s, v13.4s, v4.s[1]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
                        "ldr d11, [%[b_ptr0], #0x30]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v18.4s, v14.4s, v4.s[1]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
                        "ldr d12, [%[b_ptr0], #0x40]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v19.4s, v15.4s, v4.s[1]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
                        "ldr d13, [%[b_ptr0], #0x50]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v16.4s, v8.4s, v4.s[2]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v17.4s, v9.4s, v4.s[2]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "ldr d15, [%[b_ptr0], #0x70]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v18.4s, v10.4s, v4.s[2]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "fmla v19.4s, v11.4s, v4.s[2]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v16.4s, v12.4s, v4.s[3]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v17.4s, v13.4s, v4.s[3]\n"
                        "fmla v18.4s, v14.4s, v4.s[3]\n"
                        "fmla v19.4s, v15.4s, v4.s[3]\n"
                        "b 5f\n"
                        "4:\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr d8, [%[b_ptr0]]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "ldr d9, [%[b_ptr0], #0x10]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                        "fmla v16.4s, v12.4s, v0.s[1]\n"
                        "ldr d10, [%[b_ptr0], #0x20]\n"
                        "fmla v17.4s, v13.4s, v0.s[1]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
                        "fmla v18.4s, v14.4s, v0.s[1]\n"
                        "ldr d11, [%[b_ptr0], #0x30]\n"
                        "fmla v19.4s, v15.4s, v0.s[1]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
                        "ldr d12, [%[b_ptr0], #0x40]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
                        "ldr d13, [%[b_ptr0], #0x50]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "ldr d15, [%[b_ptr0], #0x70]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v16.4s, v12.4s, v0.s[3]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v17.4s, v13.4s, v0.s[3]\n"
                        "fmla v18.4s, v14.4s, v0.s[3]\n"
                        "fmla v19.4s, v15.4s, v0.s[3]\n"
                        "5:\n"
                        "cbz %[blocks], 6f\n"
                        "7:\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "subs %[blocks], %[blocks], #0x1\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "ldr s0, [%[a_ptr0]]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "b.ne 7b\n"
                        "6:\n"
                        "ld1r {v14.4s}, [%[minptr]]\n"
                        "ld1r {v15.4s}, [%[maxptr]]\n"
                        "fmax v16.4s, v16.4s, v14.4s\n"
                        "fmax v17.4s, v17.4s, v14.4s\n"
                        "fmax v18.4s, v18.4s, v14.4s\n"
                        "fmax v19.4s, v19.4s, v14.4s\n"
                        "fmin v16.4s, v16.4s, v15.4s\n"
                        "fmin v17.4s, v17.4s, v15.4s\n"
                        "fmin v18.4s, v18.4s, v15.4s\n"
                        "fmin v19.4s, v19.4s, v15.4s\n"
                        "str q16, [%[c_ptr0]]\n"
                        "str q17, [%[c_ptr0], #0x10]\n"
                        "str q18, [%[c_ptr0], #0x20]\n"
                        "str q19, [%[c_ptr0], #0x30]\n"
                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
                        ".unreq temploadreg0\n"
                        ".unreq temploadreg1\n"
                        ".unreq temploadreg2\n"
                        ".unreq temploadreg3\n"
                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
                    );
                    break;
                case 2:
                    __asm __volatile (
                        "a_ptr1 .req X0\n"
                        "c_ptr1 .req X1\n"
                        "temploadreg0 .req X2\n"
                        "temploadreg1 .req X3\n"
                        "temploadreg2 .req X4\n"
                        "temploadreg3 .req X5\n"
                        "add a_ptr1, %[a_ptr0], %[lda]\n"
                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
                        "cbnz %[append], 1f\n"
                        "ldr q16, [%[biasptr]]\n"
                        "ldr q17, [%[biasptr], #0x10]\n"
                        "ldr q18, [%[biasptr], #0x20]\n"
                        "ldr q19, [%[biasptr], #0x30]\n"
                        "mov v20.16b, v16.16b\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "mov v21.16b, v17.16b\n"
                        "ldr q1, [a_ptr1]\n"
                        "mov v22.16b, v18.16b\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "mov v23.16b, v19.16b\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "ldr q12, [%[b_ptr0], #0x40]\n"
                        "ldr q13, [%[b_ptr0], #0x50]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "cbz %[loops], 2f\n"
                        "b 3f\n"
                        "1:\n"
                        "ldr q16, [%[c_ptr0]]\n"
                        "ldr q17, [%[c_ptr0], #0x10]\n"
                        "ldr q18, [%[c_ptr0], #0x20]\n"
                        "ldr q19, [%[c_ptr0], #0x30]\n"
                        "ldr q20, [c_ptr1]\n"
                        "ldr q21, [c_ptr1, #0x10]\n"
                        "ldr q22, [c_ptr1, #0x20]\n"
                        "ldr q23, [c_ptr1, #0x30]\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "ldr q1, [a_ptr1]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "ldr q12, [%[b_ptr0], #0x40]\n"
                        "ldr q13, [%[b_ptr0], #0x50]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "cbz %[loops], 2f\n"
                        "3:\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "ldr d15, [%[b_ptr0], #-0x10]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "ldr d4, [%[a_ptr0]]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "ldr d5, [a_ptr1]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "ldr d8, [%[b_ptr0]]\n"
                        "fmla v16.4s, v12.4s, v0.s[1]\n"
                        "ins v4.d[1], temploadreg0\n"
                        "fmla v20.4s, v12.4s, v1.s[1]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                        "fmla v17.4s, v13.4s, v0.s[1]\n"
                        "ldr d9, [%[b_ptr0], #0x10]\n"
                        "fmla v21.4s, v13.4s, v1.s[1]\n"
                        "ins v5.d[1], temploadreg1\n"
                        "fmla v18.4s, v14.4s, v0.s[1]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                        "fmla v22.4s, v14.4s, v1.s[1]\n"
                        "ldr d10, [%[b_ptr0], #0x20]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
                        "subs %[loops], %[loops], #0x1\n"
                        "ldr d11, [%[b_ptr0], #0x30]\n"
                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
                        "add a_ptr1, a_ptr1, #0x20\n"
                        "fmla v19.4s, v15.4s, v0.s[1]\n"
                        "ldr d12, [%[b_ptr0], #0x40]\n"
                        "fmla v23.4s, v15.4s, v1.s[1]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
                        "ldr d13, [%[b_ptr0], #0x50]\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v20.4s, v8.4s, v1.s[2]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "fmla v21.4s, v9.4s, v1.s[2]\n"
                        "ldr d15, [%[b_ptr0], #0x70]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                        "fmla v22.4s, v10.4s, v1.s[2]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "ins v13.d[1], temploadreg1\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "ldr d8, [%[b_ptr0], #-0x80]\n"
                        "fmla v23.4s, v11.4s, v1.s[2]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
                        "fmla v16.4s, v12.4s, v0.s[3]\n"
                        "ldr d9, [%[b_ptr0], #-0x70]\n"
                        "fmla v20.4s, v12.4s, v1.s[3]\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
                        "fmla v17.4s, v13.4s, v0.s[3]\n"
                        "ldr d10, [%[b_ptr0], #-0x60]\n"
                        "fmla v21.4s, v13.4s, v1.s[3]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
                        "ldr d11, [%[b_ptr0], #-0x50]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v18.4s, v14.4s, v0.s[3]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
                        "fmla v22.4s, v14.4s, v1.s[3]\n"
                        "ldr d12, [%[b_ptr0], #-0x40]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v19.4s, v15.4s, v0.s[3]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
                        "fmla v23.4s, v15.4s, v1.s[3]\n"
                        "ldr d13, [%[b_ptr0], #-0x30]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v16.4s, v8.4s, v4.s[0]\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
                        "fmla v20.4s, v8.4s, v5.s[0]\n"
                        "ldr d14, [%[b_ptr0], #-0x20]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v17.4s, v9.4s, v4.s[0]\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
                        "fmla v21.4s, v9.4s, v5.s[0]\n"
                        "ldr d15, [%[b_ptr0], #-0x10]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v18.4s, v10.4s, v4.s[0]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
                        "fmla v22.4s, v10.4s, v5.s[0]\n"
                        "ldr d0, [%[a_ptr0], #-0x10]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "fmla v19.4s, v11.4s, v4.s[0]\n"
                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
                        "fmla v23.4s, v11.4s, v5.s[0]\n"
                        "ldr d1, [a_ptr1, #-0x10]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "fmla v16.4s, v12.4s, v4.s[1]\n"
                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
                        "fmla v20.4s, v12.4s, v5.s[1]\n"
                        "ldr d8, [%[b_ptr0]]\n"
                        "ins v0.d[1], temploadreg0\n"
                        "fmla v17.4s, v13.4s, v4.s[1]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                        "fmla v21.4s, v13.4s, v5.s[1]\n"
                        "ldr d9, [%[b_ptr0], #0x10]\n"
                        "ins v1.d[1], temploadreg1\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                        "ldr d10, [%[b_ptr0], #0x20]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
                        "ldr d11, [%[b_ptr0], #0x30]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v18.4s, v14.4s, v4.s[1]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
                        "fmla v22.4s, v14.4s, v5.s[1]\n"
                        "ldr d12, [%[b_ptr0], #0x40]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v19.4s, v15.4s, v4.s[1]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
                        "fmla v23.4s, v15.4s, v5.s[1]\n"
                        "ldr d13, [%[b_ptr0], #0x50]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v16.4s, v8.4s, v4.s[2]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                        "fmla v20.4s, v8.4s, v5.s[2]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v17.4s, v9.4s, v4.s[2]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "fmla v21.4s, v9.4s, v5.s[2]\n"
                        "ldr d15, [%[b_ptr0], #0x70]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v18.4s, v10.4s, v4.s[2]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                        "fmla v22.4s, v10.4s, v5.s[2]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "ins v13.d[1], temploadreg1\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v19.4s, v11.4s, v4.s[2]\n"
                        "ldr d8, [%[b_ptr0], #-0x80]\n"
                        "fmla v23.4s, v11.4s, v5.s[2]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
                        "fmla v16.4s, v12.4s, v4.s[3]\n"
                        "ldr d9, [%[b_ptr0], #-0x70]\n"
                        "fmla v20.4s, v12.4s, v5.s[3]\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
                        "fmla v17.4s, v13.4s, v4.s[3]\n"
                        "ldr d10, [%[b_ptr0], #-0x60]\n"
                        "fmla v21.4s, v13.4s, v5.s[3]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
                        "ldr d11, [%[b_ptr0], #-0x50]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v18.4s, v14.4s, v4.s[3]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
                        "fmla v22.4s, v14.4s, v5.s[3]\n"
                        "ldr d12, [%[b_ptr0], #-0x40]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v19.4s, v15.4s, v4.s[3]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
                        "fmla v23.4s, v15.4s, v5.s[3]\n"
                        "ldr d13, [%[b_ptr0], #-0x30]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
                        "ldr d14, [%[b_ptr0], #-0x20]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "ins v12.d[1], temploadreg0\n"
                        "ins v13.d[1], temploadreg1\n"
                        "b.ne 3b\n"
                        "2:\n"
                        "ins v14.d[1], temploadreg2\n"
                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
                        "ldr d15, [%[b_ptr0], #-0x10]\n"
                        "prfm PSTL1KEEP, [c_ptr1]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "cbz %[regs], 4f\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr d4, [%[a_ptr0]]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "ldr d5, [a_ptr1]\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "ldr d8, [%[b_ptr0]]\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "ins v4.d[1], temploadreg0\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "ldr d9, [%[b_ptr0], #0x10]\n"
                        "fmla v16.4s, v12.4s, v0.s[1]\n"
                        "ins v5.d[1], temploadreg1\n"
                        "fmla v20.4s, v12.4s, v1.s[1]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                        "fmla v17.4s, v13.4s, v0.s[1]\n"
                        "ldr d10, [%[b_ptr0], #0x20]\n"
                        "fmla v21.4s, v13.4s, v1.s[1]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
                        "fmla v18.4s, v14.4s, v0.s[1]\n"
                        "ldr d11, [%[b_ptr0], #0x30]\n"
                        "fmla v22.4s, v14.4s, v1.s[1]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
                        "fmla v19.4s, v15.4s, v0.s[1]\n"
                        "ldr d12, [%[b_ptr0], #0x40]\n"
                        "fmla v23.4s, v15.4s, v1.s[1]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "ldr d13, [%[b_ptr0], #0x50]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v20.4s, v8.4s, v1.s[2]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "fmla v21.4s, v9.4s, v1.s[2]\n"
                        "ldr d15, [%[b_ptr0], #0x70]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                        "fmla v22.4s, v10.4s, v1.s[2]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "ins v13.d[1], temploadreg1\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "ldr d8, [%[b_ptr0], #-0x80]\n"
                        "fmla v23.4s, v11.4s, v1.s[2]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
                        "fmla v16.4s, v12.4s, v0.s[3]\n"
                        "ldr d9, [%[b_ptr0], #-0x70]\n"
                        "fmla v20.4s, v12.4s, v1.s[3]\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
                        "fmla v17.4s, v13.4s, v0.s[3]\n"
                        "ldr d10, [%[b_ptr0], #-0x60]\n"
                        "fmla v21.4s, v13.4s, v1.s[3]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
                        "ldr d11, [%[b_ptr0], #-0x50]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v18.4s, v14.4s, v0.s[3]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
                        "fmla v22.4s, v14.4s, v1.s[3]\n"
                        "ldr d12, [%[b_ptr0], #-0x40]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v19.4s, v15.4s, v0.s[3]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
                        "fmla v23.4s, v15.4s, v1.s[3]\n"
                        "ldr d13, [%[b_ptr0], #-0x30]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v16.4s, v8.4s, v4.s[0]\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
                        "fmla v20.4s, v8.4s, v5.s[0]\n"
                        "ldr d14, [%[b_ptr0], #-0x20]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v17.4s, v9.4s, v4.s[0]\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
                        "fmla v21.4s, v9.4s, v5.s[0]\n"
                        "ldr d15, [%[b_ptr0], #-0x10]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v18.4s, v10.4s, v4.s[0]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
                        "fmla v22.4s, v10.4s, v5.s[0]\n"
                        "ldr d8, [%[b_ptr0]]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "fmla v19.4s, v11.4s, v4.s[0]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                        "fmla v23.4s, v11.4s, v5.s[0]\n"
                        "ldr d9, [%[b_ptr0], #0x10]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "fmla v16.4s, v12.4s, v4.s[1]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                        "fmla v20.4s, v12.4s, v5.s[1]\n"
                        "ldr d10, [%[b_ptr0], #0x20]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v17.4s, v13.4s, v4.s[1]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
                        "fmla v21.4s, v13.4s, v5.s[1]\n"
                        "ldr d11, [%[b_ptr0], #0x30]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v18.4s, v14.4s, v4.s[1]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
                        "fmla v22.4s, v14.4s, v5.s[1]\n"
                        "ldr d12, [%[b_ptr0], #0x40]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v19.4s, v15.4s, v4.s[1]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
                        "fmla v23.4s, v15.4s, v5.s[1]\n"
                        "ldr d13, [%[b_ptr0], #0x50]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v16.4s, v8.4s, v4.s[2]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                        "fmla v20.4s, v8.4s, v5.s[2]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v17.4s, v9.4s, v4.s[2]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "fmla v21.4s, v9.4s, v5.s[2]\n"
                        "ldr d15, [%[b_ptr0], #0x70]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v18.4s, v10.4s, v4.s[2]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                        "fmla v22.4s, v10.4s, v5.s[2]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "ins v13.d[1], temploadreg1\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "fmla v19.4s, v11.4s, v4.s[2]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v23.4s, v11.4s, v5.s[2]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v16.4s, v12.4s, v4.s[3]\n"
                        "fmla v20.4s, v12.4s, v5.s[3]\n"
                        "fmla v17.4s, v13.4s, v4.s[3]\n"
                        "fmla v21.4s, v13.4s, v5.s[3]\n"
                        "fmla v18.4s, v14.4s, v4.s[3]\n"
                        "fmla v22.4s, v14.4s, v5.s[3]\n"
                        "fmla v19.4s, v15.4s, v4.s[3]\n"
                        "fmla v23.4s, v15.4s, v5.s[3]\n"
                        "b 5f\n"
                        "4:\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "ldr d8, [%[b_ptr0]]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "ldr d9, [%[b_ptr0], #0x10]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "ldr d10, [%[b_ptr0], #0x20]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "ldr d11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v12.4s, v0.s[1]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v20.4s, v12.4s, v1.s[1]\n"
                        "ldr d12, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v13.4s, v0.s[1]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
                        "fmla v21.4s, v13.4s, v1.s[1]\n"
                        "ldr d13, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v14.4s, v0.s[1]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v22.4s, v14.4s, v1.s[1]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                        "fmla v19.4s, v15.4s, v0.s[1]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "fmla v23.4s, v15.4s, v1.s[1]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "fmla v20.4s, v8.4s, v1.s[2]\n"
                        "ldr d15, [%[b_ptr0], #0x70]\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v21.4s, v9.4s, v1.s[2]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "fmla v22.4s, v10.4s, v1.s[2]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v23.4s, v11.4s, v1.s[2]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v16.4s, v12.4s, v0.s[3]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "fmla v20.4s, v12.4s, v1.s[3]\n"
                        "fmla v17.4s, v13.4s, v0.s[3]\n"
                        "fmla v21.4s, v13.4s, v1.s[3]\n"
                        "fmla v18.4s, v14.4s, v0.s[3]\n"
                        "fmla v22.4s, v14.4s, v1.s[3]\n"
                        "fmla v19.4s, v15.4s, v0.s[3]\n"
                        "fmla v23.4s, v15.4s, v1.s[3]\n"
                        "5:\n"
                        "cbz %[blocks], 6f\n"
                        "7:\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "subs %[blocks], %[blocks], #0x1\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "ldr s0, [%[a_ptr0]]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr s1, [a_ptr1]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "add a_ptr1, a_ptr1, #0x4\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "b.ne 7b\n"
                        "6:\n"
                        "ld1r {v14.4s}, [%[minptr]]\n"
                        "ld1r {v15.4s}, [%[maxptr]]\n"
                        "fmax v16.4s, v16.4s, v14.4s\n"
                        "fmax v17.4s, v17.4s, v14.4s\n"
                        "fmax v18.4s, v18.4s, v14.4s\n"
                        "fmax v19.4s, v19.4s, v14.4s\n"
                        "fmin v16.4s, v16.4s, v15.4s\n"
                        "fmin v17.4s, v17.4s, v15.4s\n"
                        "fmin v18.4s, v18.4s, v15.4s\n"
                        "fmin v19.4s, v19.4s, v15.4s\n"
                        "str q16, [%[c_ptr0]]\n"
                        "fmax v20.4s, v20.4s, v14.4s\n"
                        "fmax v21.4s, v21.4s, v14.4s\n"
                        "fmax v22.4s, v22.4s, v14.4s\n"
                        "str q17, [%[c_ptr0], #0x10]\n"
                        "fmax v23.4s, v23.4s, v14.4s\n"
                        "fmin v20.4s, v20.4s, v15.4s\n"
                        "fmin v21.4s, v21.4s, v15.4s\n"
                        "str q18, [%[c_ptr0], #0x20]\n"
                        "fmin v22.4s, v22.4s, v15.4s\n"
                        "fmin v23.4s, v23.4s, v15.4s\n"
                        "str q19, [%[c_ptr0], #0x30]\n"
                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
                        "str q20, [c_ptr1]\n"
                        "str q21, [c_ptr1, #0x10]\n"
                        "str q22, [c_ptr1, #0x20]\n"
                        "str q23, [c_ptr1, #0x30]\n"
                        ".unreq a_ptr1\n"
                        ".unreq c_ptr1\n"
                        ".unreq temploadreg0\n"
                        ".unreq temploadreg1\n"
                        ".unreq temploadreg2\n"
                        ".unreq temploadreg3\n"
                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                    );
                    break;
                case 3:
                    __asm __volatile (
                        "a_ptr1 .req X0\n"
                        "a_ptr2 .req X1\n"
                        "c_ptr1 .req X2\n"
                        "c_ptr2 .req X3\n"
                        "temploadreg0 .req X4\n"
                        "temploadreg1 .req X5\n"
                        "temploadreg2 .req X6\n"
                        "temploadreg3 .req X7\n"
                        "add a_ptr1, %[a_ptr0], %[lda]\n"
                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
                        "add a_ptr2, a_ptr1, %[lda]\n"
                        "add c_ptr2, c_ptr1, %[ldc]\n"
                        "cbnz %[append], 1f\n"
                        "ldr q16, [%[biasptr]]\n"
                        "ldr q17, [%[biasptr], #0x10]\n"
                        "ldr q18, [%[biasptr], #0x20]\n"
                        "ldr q19, [%[biasptr], #0x30]\n"
                        "mov v20.16b, v16.16b\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "mov v21.16b, v17.16b\n"
                        "ldr q1, [a_ptr1]\n"
                        "mov v22.16b, v18.16b\n"
                        "ldr q2, [a_ptr2]\n"
                        "mov v23.16b, v19.16b\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "mov v24.16b, v16.16b\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "mov v25.16b, v17.16b\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "mov v26.16b, v18.16b\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "mov v27.16b, v19.16b\n"
                        "ldr q12, [%[b_ptr0], #0x40]\n"
                        "ldr q13, [%[b_ptr0], #0x50]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "cbz %[loops], 2f\n"
                        "b 3f\n"
                        "1:\n"
                        "ldr q16, [%[c_ptr0]]\n"
                        "ldr q17, [%[c_ptr0], #0x10]\n"
                        "ldr q18, [%[c_ptr0], #0x20]\n"
                        "ldr q19, [%[c_ptr0], #0x30]\n"
                        "ldr q20, [c_ptr1]\n"
                        "ldr q21, [c_ptr1, #0x10]\n"
                        "ldr q22, [c_ptr1, #0x20]\n"
                        "ldr q23, [c_ptr1, #0x30]\n"
                        "ldr q24, [c_ptr2]\n"
                        "ldr q25, [c_ptr2, #0x10]\n"
                        "ldr q26, [c_ptr2, #0x20]\n"
                        "ldr q27, [c_ptr2, #0x30]\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "ldr q1, [a_ptr1]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "ldr q2, [a_ptr2]\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "ldr q12, [%[b_ptr0], #0x40]\n"
                        "ldr q13, [%[b_ptr0], #0x50]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "cbz %[loops], 2f\n"
                        "3:\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "ldr d15, [%[b_ptr0], #-0x10]\n"
                        "fmla v24.4s, v8.4s, v2.s[0]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "ldr d4, [%[a_ptr0]]\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
                        "fmla v25.4s, v9.4s, v2.s[0]\n"
                        "ldr d5, [a_ptr1]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "ldr d6, [a_ptr2]\n"
                        "fmla v26.4s, v10.4s, v2.s[0]\n"
                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "ldr d8, [%[b_ptr0]]\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "ins v4.d[1], temploadreg0\n"
                        "fmla v27.4s, v11.4s, v2.s[0]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                        "fmla v16.4s, v12.4s, v0.s[1]\n"
                        "ldr d9, [%[b_ptr0], #0x10]\n"
                        "fmla v20.4s, v12.4s, v1.s[1]\n"
                        "ins v5.d[1], temploadreg1\n"
                        "fmla v24.4s, v12.4s, v2.s[1]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                        "fmla v17.4s, v13.4s, v0.s[1]\n"
                        "ldr d10, [%[b_ptr0], #0x20]\n"
                        "fmla v21.4s, v13.4s, v1.s[1]\n"
                        "ins v6.d[1], temploadreg2\n"
                        "fmla v25.4s, v13.4s, v2.s[1]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
                        "fmla v18.4s, v14.4s, v0.s[1]\n"
                        "ldr d11, [%[b_ptr0], #0x30]\n"
                        "fmla v22.4s, v14.4s, v1.s[1]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v26.4s, v14.4s, v2.s[1]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
                        "ldr d12, [%[b_ptr0], #0x40]\n"
                        "subs %[loops], %[loops], #0x1\n"
                        "fmla v19.4s, v15.4s, v0.s[1]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v23.4s, v15.4s, v1.s[1]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
                        "fmla v27.4s, v15.4s, v2.s[1]\n"
                        "ldr d13, [%[b_ptr0], #0x50]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                        "fmla v20.4s, v8.4s, v1.s[2]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "fmla v24.4s, v8.4s, v2.s[2]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "fmla v21.4s, v9.4s, v1.s[2]\n"
                        "ldr d15, [%[b_ptr0], #0x70]\n"
                        "fmla v25.4s, v9.4s, v2.s[2]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                        "fmla v22.4s, v10.4s, v1.s[2]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "fmla v26.4s, v10.4s, v2.s[2]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v23.4s, v11.4s, v1.s[2]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v27.4s, v11.4s, v2.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v16.4s, v12.4s, v0.s[3]\n"
                        "ldr d8, [%[b_ptr0], #-0x80]\n"
                        "fmla v20.4s, v12.4s, v1.s[3]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
                        "fmla v24.4s, v12.4s, v2.s[3]\n"
                        "ldr d9, [%[b_ptr0], #-0x70]\n"
                        "fmla v17.4s, v13.4s, v0.s[3]\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
                        "fmla v21.4s, v13.4s, v1.s[3]\n"
                        "ldr d10, [%[b_ptr0], #-0x60]\n"
                        "fmla v25.4s, v13.4s, v2.s[3]\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
                        "fmla v18.4s, v14.4s, v0.s[3]\n"
                        "ldr d11, [%[b_ptr0], #-0x50]\n"
                        "fmla v22.4s, v14.4s, v1.s[3]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
                        "fmla v26.4s, v14.4s, v2.s[3]\n"
                        "ldr d12, [%[b_ptr0], #-0x40]\n"
                        "fmla v19.4s, v15.4s, v0.s[3]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v23.4s, v15.4s, v1.s[3]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
                        "fmla v27.4s, v15.4s, v2.s[3]\n"
                        "ldr d13, [%[b_ptr0], #-0x30]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                        "fmla v16.4s, v8.4s, v4.s[0]\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
                        "fmla v20.4s, v8.4s, v5.s[0]\n"
                        "ldr d14, [%[b_ptr0], #-0x20]\n"
                        "fmla v24.4s, v8.4s, v6.s[0]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v17.4s, v9.4s, v4.s[0]\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
                        "fmla v21.4s, v9.4s, v5.s[0]\n"
                        "ldr d15, [%[b_ptr0], #-0x10]\n"
                        "fmla v25.4s, v9.4s, v6.s[0]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v18.4s, v10.4s, v4.s[0]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
                        "fmla v22.4s, v10.4s, v5.s[0]\n"
                        "ldr d0, [%[a_ptr0], #-0x10]\n"
                        "fmla v26.4s, v10.4s, v6.s[0]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "fmla v19.4s, v11.4s, v4.s[0]\n"
                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
                        "fmla v23.4s, v11.4s, v5.s[0]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "fmla v27.4s, v11.4s, v6.s[0]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v16.4s, v12.4s, v4.s[1]\n"
                        "ldr d8, [%[b_ptr0]]\n"
                        "fmla v20.4s, v12.4s, v5.s[1]\n"
                        "ins v0.d[1], temploadreg0\n"
                        "fmla v24.4s, v12.4s, v6.s[1]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                        "fmla v17.4s, v13.4s, v4.s[1]\n"
                        "ldr d9, [%[b_ptr0], #0x10]\n"
                        "fmla v21.4s, v13.4s, v5.s[1]\n"
                        "ldr d10, [%[b_ptr0], #0x20]\n"
                        "fmla v25.4s, v13.4s, v6.s[1]\n"
                        "ldr d11, [%[b_ptr0], #0x30]\n"
                        "fmla v18.4s, v14.4s, v4.s[1]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v22.4s, v14.4s, v5.s[1]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
                        "fmla v26.4s, v14.4s, v6.s[1]\n"
                        "ldr d12, [%[b_ptr0], #0x40]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "add a_ptr1, a_ptr1, #0x20\n"
                        "fmla v19.4s, v15.4s, v4.s[1]\n"
                        "ldr d1, [a_ptr1, #-0x10]\n"
                        "fmla v23.4s, v15.4s, v5.s[1]\n"
                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
                        "fmla v27.4s, v15.4s, v6.s[1]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
                        "fmla v16.4s, v8.4s, v4.s[2]\n"
                        "ldr d13, [%[b_ptr0], #0x50]\n"
                        "fmla v20.4s, v8.4s, v5.s[2]\n"
                        "ins v1.d[1], temploadreg1\n"
                        "fmla v24.4s, v8.4s, v6.s[2]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "add a_ptr2, a_ptr2, #0x20\n"
                        "ldr d15, [%[b_ptr0], #0x70]\n"
                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
                        "ldr d2, [a_ptr2, #-0x10]\n"
                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "ins v2.d[1], temploadreg2\n"
                        "fmla v17.4s, v9.4s, v4.s[2]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
                        "fmla v21.4s, v9.4s, v5.s[2]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                        "fmla v25.4s, v9.4s, v6.s[2]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "fmla v19.4s, v11.4s, v4.s[2]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v23.4s, v11.4s, v5.s[2]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "fmla v27.4s, v11.4s, v6.s[2]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "fmla v16.4s, v12.4s, v4.s[3]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v18.4s, v10.4s, v4.s[2]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v22.4s, v10.4s, v5.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v26.4s, v10.4s, v6.s[2]\n"
                        "ldr d8, [%[b_ptr0], #-0x80]\n"
                        "fmla v20.4s, v12.4s, v5.s[3]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
                        "fmla v24.4s, v12.4s, v6.s[3]\n"
                        "ldr d9, [%[b_ptr0], #-0x70]\n"
                        "fmla v17.4s, v13.4s, v4.s[3]\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
                        "fmla v21.4s, v13.4s, v5.s[3]\n"
                        "ldr d10, [%[b_ptr0], #-0x60]\n"
                        "fmla v25.4s, v13.4s, v6.s[3]\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
                        "fmla v18.4s, v14.4s, v4.s[3]\n"
                        "ldr d11, [%[b_ptr0], #-0x50]\n"
                        "fmla v22.4s, v14.4s, v5.s[3]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
                        "fmla v26.4s, v14.4s, v6.s[3]\n"
                        "ldr d12, [%[b_ptr0], #-0x40]\n"
                        "fmla v19.4s, v15.4s, v4.s[3]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v23.4s, v15.4s, v5.s[3]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
                        "fmla v27.4s, v15.4s, v6.s[3]\n"
                        "ldr d13, [%[b_ptr0], #-0x30]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
                        "ldr d14, [%[b_ptr0], #-0x20]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "ins v12.d[1], temploadreg0\n"
                        "ins v13.d[1], temploadreg1\n"
                        "b.ne 3b\n"
                        "2:\n"
                        "ins v14.d[1], temploadreg2\n"
                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
                        "ldr d15, [%[b_ptr0], #-0x10]\n"
                        "prfm PSTL1KEEP, [c_ptr1]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
                        "prfm PSTL1KEEP, [c_ptr2]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "cbz %[regs], 4f\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr d4, [%[a_ptr0]]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
                        "fmla v24.4s, v8.4s, v2.s[0]\n"
                        "ldr d5, [a_ptr1]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "ldr d6, [a_ptr2]\n"
                        "fmla v25.4s, v9.4s, v2.s[0]\n"
                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "ldr d8, [%[b_ptr0]]\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "ins v4.d[1], temploadreg0\n"
                        "fmla v26.4s, v10.4s, v2.s[0]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "ldr d9, [%[b_ptr0], #0x10]\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "ins v5.d[1], temploadreg1\n"
                        "fmla v27.4s, v11.4s, v2.s[0]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                        "fmla v16.4s, v12.4s, v0.s[1]\n"
                        "ldr d10, [%[b_ptr0], #0x20]\n"
                        "fmla v20.4s, v12.4s, v1.s[1]\n"
                        "ins v6.d[1], temploadreg2\n"
                        "fmla v24.4s, v12.4s, v2.s[1]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
                        "fmla v17.4s, v13.4s, v0.s[1]\n"
                        "ldr d11, [%[b_ptr0], #0x30]\n"
                        "fmla v21.4s, v13.4s, v1.s[1]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
                        "fmla v25.4s, v13.4s, v2.s[1]\n"
                        "ldr d12, [%[b_ptr0], #0x40]\n"
                        "fmla v18.4s, v14.4s, v0.s[1]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v22.4s, v14.4s, v1.s[1]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
                        "fmla v26.4s, v14.4s, v2.s[1]\n"
                        "ldr d13, [%[b_ptr0], #0x50]\n"
                        "fmla v19.4s, v15.4s, v0.s[1]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v23.4s, v15.4s, v1.s[1]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                        "fmla v27.4s, v15.4s, v2.s[1]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v20.4s, v8.4s, v1.s[2]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "fmla v24.4s, v8.4s, v2.s[2]\n"
                        "ldr d15, [%[b_ptr0], #0x70]\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v21.4s, v9.4s, v1.s[2]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                        "fmla v25.4s, v9.4s, v2.s[2]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "fmla v22.4s, v10.4s, v1.s[2]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v26.4s, v10.4s, v2.s[2]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v23.4s, v11.4s, v1.s[2]\n"
                        "ldr d8, [%[b_ptr0], #-0x80]\n"
                        "fmla v27.4s, v11.4s, v2.s[2]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
                        "fmla v16.4s, v12.4s, v0.s[3]\n"
                        "ldr d9, [%[b_ptr0], #-0x70]\n"
                        "fmla v20.4s, v12.4s, v1.s[3]\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
                        "fmla v24.4s, v12.4s, v2.s[3]\n"
                        "ldr d10, [%[b_ptr0], #-0x60]\n"
                        "fmla v17.4s, v13.4s, v0.s[3]\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
                        "fmla v21.4s, v13.4s, v1.s[3]\n"
                        "ldr d11, [%[b_ptr0], #-0x50]\n"
                        "fmla v25.4s, v13.4s, v2.s[3]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
                        "fmla v18.4s, v14.4s, v0.s[3]\n"
                        "ldr d12, [%[b_ptr0], #-0x40]\n"
                        "fmla v22.4s, v14.4s, v1.s[3]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v26.4s, v14.4s, v2.s[3]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
                        "fmla v19.4s, v15.4s, v0.s[3]\n"
                        "ldr d13, [%[b_ptr0], #-0x30]\n"
                        "fmla v23.4s, v15.4s, v1.s[3]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v27.4s, v15.4s, v2.s[3]\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
                        "fmla v16.4s, v8.4s, v4.s[0]\n"
                        "ldr d14, [%[b_ptr0], #-0x20]\n"
                        "fmla v20.4s, v8.4s, v5.s[0]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v24.4s, v8.4s, v6.s[0]\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
                        "fmla v17.4s, v9.4s, v4.s[0]\n"
                        "ldr d15, [%[b_ptr0], #-0x10]\n"
                        "fmla v21.4s, v9.4s, v5.s[0]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v25.4s, v9.4s, v6.s[0]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
                        "fmla v18.4s, v10.4s, v4.s[0]\n"
                        "ldr d8, [%[b_ptr0]]\n"
                        "fmla v22.4s, v10.4s, v5.s[0]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "fmla v26.4s, v10.4s, v6.s[0]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                        "fmla v19.4s, v11.4s, v4.s[0]\n"
                        "ldr d9, [%[b_ptr0], #0x10]\n"
                        "fmla v23.4s, v11.4s, v5.s[0]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "fmla v27.4s, v11.4s, v6.s[0]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                        "fmla v16.4s, v12.4s, v4.s[1]\n"
                        "ldr d10, [%[b_ptr0], #0x20]\n"
                        "fmla v20.4s, v12.4s, v5.s[1]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v24.4s, v12.4s, v6.s[1]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
                        "fmla v17.4s, v13.4s, v4.s[1]\n"
                        "ldr d11, [%[b_ptr0], #0x30]\n"
                        "fmla v21.4s, v13.4s, v5.s[1]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v25.4s, v13.4s, v6.s[1]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
                        "fmla v18.4s, v14.4s, v4.s[1]\n"
                        "ldr d12, [%[b_ptr0], #0x40]\n"
                        "fmla v22.4s, v14.4s, v5.s[1]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v26.4s, v14.4s, v6.s[1]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
                        "fmla v19.4s, v15.4s, v4.s[1]\n"
                        "ldr d13, [%[b_ptr0], #0x50]\n"
                        "fmla v23.4s, v15.4s, v5.s[1]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v27.4s, v15.4s, v6.s[1]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                        "fmla v16.4s, v8.4s, v4.s[2]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "fmla v20.4s, v8.4s, v5.s[2]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v24.4s, v8.4s, v6.s[2]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "fmla v17.4s, v9.4s, v4.s[2]\n"
                        "ldr d15, [%[b_ptr0], #0x70]\n"
                        "fmla v21.4s, v9.4s, v5.s[2]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v25.4s, v9.4s, v6.s[2]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                        "fmla v18.4s, v10.4s, v4.s[2]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "fmla v22.4s, v10.4s, v5.s[2]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "fmla v26.4s, v10.4s, v6.s[2]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v19.4s, v11.4s, v4.s[2]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v23.4s, v11.4s, v5.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "fmla v27.4s, v11.4s, v6.s[2]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "fmla v16.4s, v12.4s, v4.s[3]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "fmla v20.4s, v12.4s, v5.s[3]\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "fmla v24.4s, v12.4s, v6.s[3]\n"
                        "fmla v17.4s, v13.4s, v4.s[3]\n"
                        "fmla v21.4s, v13.4s, v5.s[3]\n"
                        "fmla v25.4s, v13.4s, v6.s[3]\n"
                        "fmla v18.4s, v14.4s, v4.s[3]\n"
                        "fmla v22.4s, v14.4s, v5.s[3]\n"
                        "fmla v26.4s, v14.4s, v6.s[3]\n"
                        "fmla v19.4s, v15.4s, v4.s[3]\n"
                        "fmla v23.4s, v15.4s, v5.s[3]\n"
                        "fmla v27.4s, v15.4s, v6.s[3]\n"
                        "b 5f\n"
                        "4:\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                        "fmla v24.4s, v8.4s, v2.s[0]\n"
                        "ldr d8, [%[b_ptr0]]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
                        "fmla v25.4s, v9.4s, v2.s[0]\n"
                        "ldr d9, [%[b_ptr0], #0x10]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
                        "fmla v26.4s, v10.4s, v2.s[0]\n"
                        "ldr d10, [%[b_ptr0], #0x20]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                        "fmla v27.4s, v11.4s, v2.s[0]\n"
                        "ldr d11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v12.4s, v0.s[1]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v20.4s, v12.4s, v1.s[1]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "fmla v24.4s, v12.4s, v2.s[1]\n"
                        "ldr d12, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v13.4s, v0.s[1]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v21.4s, v13.4s, v1.s[1]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                        "fmla v25.4s, v13.4s, v2.s[1]\n"
                        "ldr d13, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v14.4s, v0.s[1]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "fmla v22.4s, v14.4s, v1.s[1]\n"
                        "fmla v26.4s, v14.4s, v2.s[1]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v15.4s, v0.s[1]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "fmla v23.4s, v15.4s, v1.s[1]\n"
                        "fmla v27.4s, v15.4s, v2.s[1]\n"
                        "ldr d15, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v20.4s, v8.4s, v1.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "fmla v24.4s, v8.4s, v2.s[2]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "fmla v21.4s, v9.4s, v1.s[2]\n"
                        "fmla v25.4s, v9.4s, v2.s[2]\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "fmla v22.4s, v10.4s, v1.s[2]\n"
                        "fmla v26.4s, v10.4s, v2.s[2]\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "fmla v23.4s, v11.4s, v1.s[2]\n"
                        "fmla v27.4s, v11.4s, v2.s[2]\n"
                        "fmla v16.4s, v12.4s, v0.s[3]\n"
                        "fmla v20.4s, v12.4s, v1.s[3]\n"
                        "fmla v24.4s, v12.4s, v2.s[3]\n"
                        "fmla v17.4s, v13.4s, v0.s[3]\n"
                        "fmla v21.4s, v13.4s, v1.s[3]\n"
                        "fmla v25.4s, v13.4s, v2.s[3]\n"
                        "fmla v18.4s, v14.4s, v0.s[3]\n"
                        "fmla v22.4s, v14.4s, v1.s[3]\n"
                        "fmla v26.4s, v14.4s, v2.s[3]\n"
                        "fmla v19.4s, v15.4s, v0.s[3]\n"
                        "fmla v23.4s, v15.4s, v1.s[3]\n"
                        "fmla v27.4s, v15.4s, v2.s[3]\n"
                        "5:\n"
                        "cbz %[blocks], 6f\n"
                        "7:\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "subs %[blocks], %[blocks], #0x1\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "ldr s0, [%[a_ptr0]]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr s1, [a_ptr1]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "add a_ptr1, a_ptr1, #0x4\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "ldr s2, [a_ptr2]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "add a_ptr2, a_ptr2, #0x4\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "fmla v24.4s, v8.4s, v2.s[0]\n"
                        "fmla v25.4s, v9.4s, v2.s[0]\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "fmla v26.4s, v10.4s, v2.s[0]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "fmla v27.4s, v11.4s, v2.s[0]\n"
                        "b.ne 7b\n"
                        "6:\n"
                        "ld1r {v14.4s}, [%[minptr]]\n"
                        "ld1r {v15.4s}, [%[maxptr]]\n"
                        "fmax v16.4s, v16.4s, v14.4s\n"
                        "fmax v17.4s, v17.4s, v14.4s\n"
                        "fmax v18.4s, v18.4s, v14.4s\n"
                        "fmax v19.4s, v19.4s, v14.4s\n"
                        "fmin v16.4s, v16.4s, v15.4s\n"
                        "fmin v17.4s, v17.4s, v15.4s\n"
                        "fmin v18.4s, v18.4s, v15.4s\n"
                        "fmin v19.4s, v19.4s, v15.4s\n"
                        "str q16, [%[c_ptr0]]\n"
                        "fmax v20.4s, v20.4s, v14.4s\n"
                        "fmax v21.4s, v21.4s, v14.4s\n"
                        "fmax v22.4s, v22.4s, v14.4s\n"
                        "str q17, [%[c_ptr0], #0x10]\n"
                        "fmax v23.4s, v23.4s, v14.4s\n"
                        "fmin v20.4s, v20.4s, v15.4s\n"
                        "fmin v21.4s, v21.4s, v15.4s\n"
                        "str q18, [%[c_ptr0], #0x20]\n"
                        "fmin v22.4s, v22.4s, v15.4s\n"
                        "fmin v23.4s, v23.4s, v15.4s\n"
                        "fmax v24.4s, v24.4s, v14.4s\n"
                        "str q19, [%[c_ptr0], #0x30]\n"
                        "fmax v25.4s, v25.4s, v14.4s\n"
                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
                        "fmax v26.4s, v26.4s, v14.4s\n"
                        "str q20, [c_ptr1]\n"
                        "fmin v24.4s, v24.4s, v15.4s\n"
                        "fmin v25.4s, v25.4s, v15.4s\n"
                        "fmax v27.4s, v27.4s, v14.4s\n"
                        "str q21, [c_ptr1, #0x10]\n"
                        "fmin v26.4s, v26.4s, v15.4s\n"
                        "fmin v27.4s, v27.4s, v15.4s\n"
                        "str q22, [c_ptr1, #0x20]\n"
                        "str q23, [c_ptr1, #0x30]\n"
                        "str q24, [c_ptr2]\n"
                        "str q25, [c_ptr2, #0x10]\n"
                        "str q26, [c_ptr2, #0x20]\n"
                        "str q27, [c_ptr2, #0x30]\n"
                        ".unreq a_ptr1\n"
                        ".unreq a_ptr2\n"
                        ".unreq c_ptr1\n"
                        ".unreq c_ptr2\n"
                        ".unreq temploadreg0\n"
                        ".unreq temploadreg1\n"
                        ".unreq temploadreg2\n"
                        ".unreq temploadreg3\n"
                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
                    );
                    break;
                default:
                case 4:
                    __asm __volatile (
                        "a_ptr1 .req X0\n"
                        "a_ptr2 .req X1\n"
                        "a_ptr3 .req X2\n"
                        "c_ptr1 .req X3\n"
                        "c_ptr2 .req X4\n"
                        "c_ptr3 .req X5\n"
                        "temploadreg0 .req X6\n"
                        "temploadreg1 .req X7\n"
                        "temploadreg2 .req X8\n"
                        "temploadreg3 .req X9\n"
                        "add a_ptr1, %[a_ptr0], %[lda]\n"
                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
                        "add a_ptr2, a_ptr1, %[lda]\n"
                        "add c_ptr2, c_ptr1, %[ldc]\n"
                        "add a_ptr3, a_ptr2, %[lda]\n"
                        "add c_ptr3, c_ptr2, %[ldc]\n"
                        "cbnz %[append], 1f\n"
                        "ldr q16, [%[biasptr]]\n"
                        "ldr q17, [%[biasptr], #0x10]\n"
                        "ldr q18, [%[biasptr], #0x20]\n"
                        "ldr q19, [%[biasptr], #0x30]\n"
                        "mov v20.16b, v16.16b\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "mov v21.16b, v17.16b\n"
                        "ldr q1, [a_ptr1]\n"
                        "mov v22.16b, v18.16b\n"
                        "ldr q2, [a_ptr2]\n"
                        "mov v23.16b, v19.16b\n"
                        "ldr q3, [a_ptr3]\n"
                        "mov v24.16b, v16.16b\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "mov v25.16b, v17.16b\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "mov v26.16b, v18.16b\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "mov v27.16b, v19.16b\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "mov v28.16b, v16.16b\n"
                        "ldr q12, [%[b_ptr0], #0x40]\n"
                        "mov v29.16b, v17.16b\n"
                        "ldr q13, [%[b_ptr0], #0x50]\n"
                        "mov v30.16b, v18.16b\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "mov v31.16b, v19.16b\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "ins v14.d[1], temploadreg2\n"
                        "add a_ptr3, a_ptr3, #0x10\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "cbz %[loops], 2f\n"
                        "b 3f\n"
                        "1:\n"
                        "ldr q16, [%[c_ptr0]]\n"
                        "ldr q17, [%[c_ptr0], #0x10]\n"
                        "ldr q18, [%[c_ptr0], #0x20]\n"
                        "ldr q19, [%[c_ptr0], #0x30]\n"
                        "ldr q20, [c_ptr1]\n"
                        "ldr q21, [c_ptr1, #0x10]\n"
                        "ldr q22, [c_ptr1, #0x20]\n"
                        "ldr q23, [c_ptr1, #0x30]\n"
                        "ldr q24, [c_ptr2]\n"
                        "ldr q25, [c_ptr2, #0x10]\n"
                        "ldr q26, [c_ptr2, #0x20]\n"
                        "ldr q27, [c_ptr2, #0x30]\n"
                        "ldr q28, [c_ptr3]\n"
                        "ldr q29, [c_ptr3, #0x10]\n"
                        "ldr q30, [c_ptr3, #0x20]\n"
                        "ldr q31, [c_ptr3, #0x30]\n"
                        "ldr q0, [%[a_ptr0]]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "ldr q1, [a_ptr1]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "ldr q2, [a_ptr2]\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "ldr q3, [a_ptr3]\n"
                        "add a_ptr3, a_ptr3, #0x10\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "ldr q12, [%[b_ptr0], #0x40]\n"
                        "ldr q13, [%[b_ptr0], #0x50]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "ins v14.d[1], temploadreg2\n"
                        "cbz %[loops], 2f\n"
                        "3:\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr d15, [%[b_ptr0], #-0x10]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
                        "fmla v24.4s, v8.4s, v2.s[0]\n"
                        "ldr d4, [%[a_ptr0]]\n"
                        "fmla v28.4s, v8.4s, v3.s[0]\n"
                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "ldr d5, [a_ptr1]\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
                        "fmla v25.4s, v9.4s, v2.s[0]\n"
                        "ldr d6, [a_ptr2]\n"
                        "fmla v29.4s, v9.4s, v3.s[0]\n"
                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "ldr d7, [a_ptr3]\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v26.4s, v10.4s, v2.s[0]\n"
                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
                        "fmla v30.4s, v10.4s, v3.s[0]\n"
                        "ldr d8, [%[b_ptr0]]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "ins v4.d[1], temploadreg0\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                        "fmla v27.4s, v11.4s, v2.s[0]\n"
                        "ldr d9, [%[b_ptr0], #0x10]\n"
                        "fmla v31.4s, v11.4s, v3.s[0]\n"
                        "ins v5.d[1], temploadreg1\n"
                        "fmla v16.4s, v12.4s, v0.s[1]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                        "fmla v20.4s, v12.4s, v1.s[1]\n"
                        "ldr d10, [%[b_ptr0], #0x20]\n"
                        "fmla v24.4s, v12.4s, v2.s[1]\n"
                        "ins v6.d[1], temploadreg2\n"
                        "fmla v28.4s, v12.4s, v3.s[1]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
                        "fmla v17.4s, v13.4s, v0.s[1]\n"
                        "ldr d11, [%[b_ptr0], #0x30]\n"
                        "fmla v21.4s, v13.4s, v1.s[1]\n"
                        "ins v7.d[1], temploadreg3\n"
                        "fmla v25.4s, v13.4s, v2.s[1]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
                        "fmla v29.4s, v13.4s, v3.s[1]\n"
                        "ldr d12, [%[b_ptr0], #0x40]\n"
                        "fmla v18.4s, v14.4s, v0.s[1]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v22.4s, v14.4s, v1.s[1]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
                        "fmla v26.4s, v14.4s, v2.s[1]\n"
                        "ldr d13, [%[b_ptr0], #0x50]\n"
                        "fmla v30.4s, v14.4s, v3.s[1]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v19.4s, v15.4s, v0.s[1]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                        "fmla v23.4s, v15.4s, v1.s[1]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "fmla v27.4s, v15.4s, v2.s[1]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v31.4s, v15.4s, v3.s[1]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "ldr d15, [%[b_ptr0], #0x70]\n"
                        "fmla v20.4s, v8.4s, v1.s[2]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v24.4s, v8.4s, v2.s[2]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                        "fmla v28.4s, v8.4s, v3.s[2]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "fmla v21.4s, v9.4s, v1.s[2]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v25.4s, v9.4s, v2.s[2]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v29.4s, v9.4s, v3.s[2]\n"
                        "subs %[loops], %[loops], #0x1\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
                        "fmla v22.4s, v10.4s, v1.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v26.4s, v10.4s, v2.s[2]\n"
                        "ldr d8, [%[b_ptr0], #-0x80]\n"
                        "fmla v30.4s, v10.4s, v3.s[2]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "ldr d9, [%[b_ptr0], #-0x70]\n"
                        "fmla v23.4s, v11.4s, v1.s[2]\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
                        "fmla v27.4s, v11.4s, v2.s[2]\n"
                        "ldr d10, [%[b_ptr0], #-0x60]\n"
                        "fmla v31.4s, v11.4s, v3.s[2]\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
                        "fmla v16.4s, v12.4s, v0.s[3]\n"
                        "ldr d11, [%[b_ptr0], #-0x50]\n"
                        "fmla v20.4s, v12.4s, v1.s[3]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
                        "fmla v24.4s, v12.4s, v2.s[3]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v28.4s, v12.4s, v3.s[3]\n"
                        "ldr d12, [%[b_ptr0], #-0x40]\n"
                        "fmla v17.4s, v13.4s, v0.s[3]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
                        "fmla v21.4s, v13.4s, v1.s[3]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v25.4s, v13.4s, v2.s[3]\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
                        "fmla v29.4s, v13.4s, v3.s[3]\n"
                        "ldr d13, [%[b_ptr0], #-0x30]\n"
                        "fmla v18.4s, v14.4s, v0.s[3]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v22.4s, v14.4s, v1.s[3]\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
                        "fmla v26.4s, v14.4s, v2.s[3]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v30.4s, v14.4s, v3.s[3]\n"
                        "ldr d14, [%[b_ptr0], #-0x20]\n"
                        "fmla v19.4s, v15.4s, v0.s[3]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
                        "fmla v23.4s, v15.4s, v1.s[3]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "fmla v27.4s, v15.4s, v2.s[3]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "fmla v31.4s, v15.4s, v3.s[3]\n"
                        "ldr d15, [%[b_ptr0], #-0x10]\n"
                        "fmla v16.4s, v8.4s, v4.s[0]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v20.4s, v8.4s, v5.s[0]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
                        "fmla v24.4s, v8.4s, v6.s[0]\n"
                        "ldr d0, [%[a_ptr0], #-0x10]\n"
                        "fmla v28.4s, v8.4s, v7.s[0]\n"
                        "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
                        "fmla v17.4s, v9.4s, v4.s[0]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v21.4s, v9.4s, v5.s[0]\n"
                        "ldr d8, [%[b_ptr0]]\n"
                        "fmla v25.4s, v9.4s, v6.s[0]\n"
                        "ins v0.d[1], temploadreg0\n"
                        "fmla v29.4s, v9.4s, v7.s[0]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                        "fmla v18.4s, v10.4s, v4.s[0]\n"
                        "ldr d9, [%[b_ptr0], #0x10]\n"
                        "fmla v22.4s, v10.4s, v5.s[0]\n"
                        "add a_ptr1, a_ptr1, #0x20\n"
                        "fmla v26.4s, v10.4s, v6.s[0]\n"
                        "ldr d1, [a_ptr1, #-0x10]\n"
                        "fmla v30.4s, v10.4s, v7.s[0]\n"
                        "ldr temploadreg1, [a_ptr1, #-0x8]\n"
                        "fmla v19.4s, v11.4s, v4.s[0]\n"
                        "ldr d10, [%[b_ptr0], #0x20]\n"
                        "fmla v23.4s, v11.4s, v5.s[0]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v27.4s, v11.4s, v6.s[0]\n"
                        "ins v1.d[1], temploadreg1\n"
                        "fmla v31.4s, v11.4s, v7.s[0]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                        "fmla v16.4s, v12.4s, v4.s[1]\n"
                        "ldr d11, [%[b_ptr0], #0x30]\n"
                        "fmla v20.4s, v12.4s, v5.s[1]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
                        "fmla v24.4s, v12.4s, v6.s[1]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v28.4s, v12.4s, v7.s[1]\n"
                        "ldr d12, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v13.4s, v4.s[1]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                        "fmla v21.4s, v13.4s, v5.s[1]\n"
                        "add a_ptr2, a_ptr2, #0x20\n"
                        "fmla v25.4s, v13.4s, v6.s[1]\n"
                        "ldr d2, [a_ptr2, #-0x10]\n"
                        "fmla v29.4s, v13.4s, v7.s[1]\n"
                        "ldr temploadreg2, [a_ptr2, #-0x8]\n"
                        "fmla v18.4s, v14.4s, v4.s[1]\n"
                        "ldr d13, [%[b_ptr0], #0x50]\n"
                        "fmla v22.4s, v14.4s, v5.s[1]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "fmla v26.4s, v14.4s, v6.s[1]\n"
                        "ins v2.d[1], temploadreg2\n"
                        "fmla v30.4s, v14.4s, v7.s[1]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
                        "fmla v19.4s, v15.4s, v4.s[1]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "fmla v23.4s, v15.4s, v5.s[1]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "fmla v27.4s, v15.4s, v6.s[1]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v31.4s, v15.4s, v7.s[1]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "fmla v16.4s, v8.4s, v4.s[2]\n"
                        "ldr d15, [%[b_ptr0], #0x70]\n"
                        "fmla v20.4s, v8.4s, v5.s[2]\n"
                        "add a_ptr3, a_ptr3, #0x20\n"
                        "fmla v24.4s, v8.4s, v6.s[2]\n"
                        "ldr d3, [a_ptr3, #-0x10]\n"
                        "fmla v28.4s, v8.4s, v7.s[2]\n"
                        "ldr temploadreg3, [a_ptr3, #-0x8]\n"
                        "fmla v17.4s, v9.4s, v4.s[2]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v21.4s, v9.4s, v5.s[2]\n"
                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
                        "fmla v25.4s, v9.4s, v6.s[2]\n"
                        "ins v3.d[1], temploadreg3\n"
                        "fmla v29.4s, v9.4s, v7.s[2]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
                        "fmla v18.4s, v10.4s, v4.s[2]\n"
                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
                        "fmla v22.4s, v10.4s, v5.s[2]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v26.4s, v10.4s, v6.s[2]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                        "fmla v30.4s, v10.4s, v7.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v19.4s, v11.4s, v4.s[2]\n"
                        "ldr d8, [%[b_ptr0], #-0x80]\n"
                        "fmla v23.4s, v11.4s, v5.s[2]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
                        "fmla v27.4s, v11.4s, v6.s[2]\n"
                        "ldr d9, [%[b_ptr0], #-0x70]\n"
                        "fmla v31.4s, v11.4s, v7.s[2]\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
                        "fmla v16.4s, v12.4s, v4.s[3]\n"
                        "ldr d10, [%[b_ptr0], #-0x60]\n"
                        "fmla v20.4s, v12.4s, v5.s[3]\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
                        "fmla v24.4s, v12.4s, v6.s[3]\n"
                        "ldr d11, [%[b_ptr0], #-0x50]\n"
                        "fmla v28.4s, v12.4s, v7.s[3]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v17.4s, v13.4s, v4.s[3]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
                        "fmla v21.4s, v13.4s, v5.s[3]\n"
                        "ldr d12, [%[b_ptr0], #-0x40]\n"
                        "fmla v25.4s, v13.4s, v6.s[3]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v29.4s, v13.4s, v7.s[3]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
                        "fmla v18.4s, v14.4s, v4.s[3]\n"
                        "ldr d13, [%[b_ptr0], #-0x30]\n"
                        "fmla v22.4s, v14.4s, v5.s[3]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v26.4s, v14.4s, v6.s[3]\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
                        "fmla v30.4s, v14.4s, v7.s[3]\n"
                        "ldr d14, [%[b_ptr0], #-0x20]\n"
                        "fmla v19.4s, v15.4s, v4.s[3]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v23.4s, v15.4s, v5.s[3]\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
                        "fmla v27.4s, v15.4s, v6.s[3]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v31.4s, v15.4s, v7.s[3]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "ins v13.d[1], temploadreg1\n"
                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "b.ne 3b\n"
                        "2:\n"
                        "ldr d15, [%[b_ptr0], #-0x10]\n"
                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
                        "prfm PSTL1KEEP, [c_ptr1]\n"
                        "prfm PSTL1KEEP, [c_ptr2]\n"
                        "prfm PSTL1KEEP, [c_ptr3]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "cbz %[regs], 4f\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr d4, [%[a_ptr0]]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
                        "fmla v24.4s, v8.4s, v2.s[0]\n"
                        "ldr d5, [a_ptr1]\n"
                        "fmla v28.4s, v8.4s, v3.s[0]\n"
                        "ldr temploadreg1, [a_ptr1, #0x8]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "ldr d6, [a_ptr2]\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "ldr temploadreg2, [a_ptr2, #0x8]\n"
                        "fmla v25.4s, v9.4s, v2.s[0]\n"
                        "ldr d7, [a_ptr3]\n"
                        "fmla v29.4s, v9.4s, v3.s[0]\n"
                        "ldr temploadreg3, [a_ptr3, #0x8]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "ldr d8, [%[b_ptr0]]\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "ins v4.d[1], temploadreg0\n"
                        "fmla v26.4s, v10.4s, v2.s[0]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                        "fmla v30.4s, v10.4s, v3.s[0]\n"
                        "ldr d9, [%[b_ptr0], #0x10]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "ins v5.d[1], temploadreg1\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                        "fmla v27.4s, v11.4s, v2.s[0]\n"
                        "ldr d10, [%[b_ptr0], #0x20]\n"
                        "fmla v31.4s, v11.4s, v3.s[0]\n"
                        "ins v6.d[1], temploadreg2\n"
                        "fmla v16.4s, v12.4s, v0.s[1]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
                        "fmla v20.4s, v12.4s, v1.s[1]\n"
                        "ldr d11, [%[b_ptr0], #0x30]\n"
                        "fmla v24.4s, v12.4s, v2.s[1]\n"
                        "ins v7.d[1], temploadreg3\n"
                        "fmla v28.4s, v12.4s, v3.s[1]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
                        "fmla v17.4s, v13.4s, v0.s[1]\n"
                        "ldr d12, [%[b_ptr0], #0x40]\n"
                        "fmla v21.4s, v13.4s, v1.s[1]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v25.4s, v13.4s, v2.s[1]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
                        "fmla v29.4s, v13.4s, v3.s[1]\n"
                        "ldr d13, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v14.4s, v0.s[1]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v22.4s, v14.4s, v1.s[1]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                        "fmla v26.4s, v14.4s, v2.s[1]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v30.4s, v14.4s, v3.s[1]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v15.4s, v0.s[1]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "fmla v23.4s, v15.4s, v1.s[1]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v27.4s, v15.4s, v2.s[1]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                        "fmla v31.4s, v15.4s, v3.s[1]\n"
                        "ldr d15, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "fmla v20.4s, v8.4s, v1.s[2]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "fmla v24.4s, v8.4s, v2.s[2]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v28.4s, v8.4s, v3.s[2]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
                        "fmla v21.4s, v9.4s, v1.s[2]\n"
                        "ldr d8, [%[b_ptr0], #-0x80]\n"
                        "fmla v25.4s, v9.4s, v2.s[2]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
                        "fmla v29.4s, v9.4s, v3.s[2]\n"
                        "ldr d9, [%[b_ptr0], #-0x70]\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
                        "fmla v22.4s, v10.4s, v1.s[2]\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
                        "fmla v26.4s, v10.4s, v2.s[2]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
                        "fmla v30.4s, v10.4s, v3.s[2]\n"
                        "ldr d10, [%[b_ptr0], #-0x60]\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v23.4s, v11.4s, v1.s[2]\n"
                        "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
                        "fmla v27.4s, v11.4s, v2.s[2]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v31.4s, v11.4s, v3.s[2]\n"
                        "ldr d11, [%[b_ptr0], #-0x50]\n"
                        "fmla v16.4s, v12.4s, v0.s[3]\n"
                        "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
                        "fmla v20.4s, v12.4s, v1.s[3]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v24.4s, v12.4s, v2.s[3]\n"
                        "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
                        "fmla v28.4s, v12.4s, v3.s[3]\n"
                        "ldr d12, [%[b_ptr0], #-0x40]\n"
                        "fmla v17.4s, v13.4s, v0.s[3]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v21.4s, v13.4s, v1.s[3]\n"
                        "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
                        "fmla v25.4s, v13.4s, v2.s[3]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "fmla v29.4s, v13.4s, v3.s[3]\n"
                        "ldr d13, [%[b_ptr0], #-0x30]\n"
                        "fmla v18.4s, v14.4s, v0.s[3]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                        "fmla v22.4s, v14.4s, v1.s[3]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
                        "fmla v26.4s, v14.4s, v2.s[3]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "fmla v30.4s, v14.4s, v3.s[3]\n"
                        "ldr d14, [%[b_ptr0], #-0x20]\n"
                        "fmla v19.4s, v15.4s, v0.s[3]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                        "fmla v23.4s, v15.4s, v1.s[3]\n"
                        "add a_ptr1, a_ptr1, #0x10\n"
                        "fmla v27.4s, v15.4s, v2.s[3]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v31.4s, v15.4s, v3.s[3]\n"
                        "ldr d15, [%[b_ptr0], #-0x10]\n"
                        "fmla v16.4s, v8.4s, v4.s[0]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
                        "fmla v20.4s, v8.4s, v5.s[0]\n"
                        "add a_ptr2, a_ptr2, #0x10\n"
                        "fmla v24.4s, v8.4s, v6.s[0]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v28.4s, v8.4s, v7.s[0]\n"
                        "ldr d8, [%[b_ptr0]]\n"
                        "fmla v17.4s, v9.4s, v4.s[0]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
                        "fmla v21.4s, v9.4s, v5.s[0]\n"
                        "add a_ptr3, a_ptr3, #0x10\n"
                        "fmla v25.4s, v9.4s, v6.s[0]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v29.4s, v9.4s, v7.s[0]\n"
                        "ldr d9, [%[b_ptr0], #0x10]\n"
                        "fmla v18.4s, v10.4s, v4.s[0]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
                        "fmla v22.4s, v10.4s, v5.s[0]\n"
                        "fmla v26.4s, v10.4s, v6.s[0]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v30.4s, v10.4s, v7.s[0]\n"
                        "ldr d10, [%[b_ptr0], #0x20]\n"
                        "fmla v19.4s, v11.4s, v4.s[0]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                        "fmla v23.4s, v11.4s, v5.s[0]\n"
                        "fmla v27.4s, v11.4s, v6.s[0]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v31.4s, v11.4s, v7.s[0]\n"
                        "ldr d11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v12.4s, v4.s[1]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "fmla v20.4s, v12.4s, v5.s[1]\n"
                        "fmla v24.4s, v12.4s, v6.s[1]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v28.4s, v12.4s, v7.s[1]\n"
                        "ldr d12, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v13.4s, v4.s[1]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                        "fmla v21.4s, v13.4s, v5.s[1]\n"
                        "fmla v25.4s, v13.4s, v6.s[1]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "fmla v29.4s, v13.4s, v7.s[1]\n"
                        "ldr d13, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v14.4s, v4.s[1]\n"
                        "fmla v22.4s, v14.4s, v5.s[1]\n"
                        "fmla v26.4s, v14.4s, v6.s[1]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "fmla v30.4s, v14.4s, v7.s[1]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v15.4s, v4.s[1]\n"
                        "fmla v23.4s, v15.4s, v5.s[1]\n"
                        "fmla v27.4s, v15.4s, v6.s[1]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v31.4s, v15.4s, v7.s[1]\n"
                        "ldr d15, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v4.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "fmla v20.4s, v8.4s, v5.s[2]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v24.4s, v8.4s, v6.s[2]\n"
                        "fmla v28.4s, v8.4s, v7.s[2]\n"
                        "fmla v17.4s, v9.4s, v4.s[2]\n"
                        "fmla v21.4s, v9.4s, v5.s[2]\n"
                        "fmla v25.4s, v9.4s, v6.s[2]\n"
                        "fmla v29.4s, v9.4s, v7.s[2]\n"
                        "fmla v18.4s, v10.4s, v4.s[2]\n"
                        "fmla v22.4s, v10.4s, v5.s[2]\n"
                        "fmla v26.4s, v10.4s, v6.s[2]\n"
                        "fmla v30.4s, v10.4s, v7.s[2]\n"
                        "fmla v19.4s, v11.4s, v4.s[2]\n"
                        "fmla v23.4s, v11.4s, v5.s[2]\n"
                        "fmla v27.4s, v11.4s, v6.s[2]\n"
                        "fmla v31.4s, v11.4s, v7.s[2]\n"
                        "fmla v16.4s, v12.4s, v4.s[3]\n"
                        "fmla v20.4s, v12.4s, v5.s[3]\n"
                        "fmla v24.4s, v12.4s, v6.s[3]\n"
                        "fmla v28.4s, v12.4s, v7.s[3]\n"
                        "fmla v17.4s, v13.4s, v4.s[3]\n"
                        "fmla v21.4s, v13.4s, v5.s[3]\n"
                        "fmla v25.4s, v13.4s, v6.s[3]\n"
                        "fmla v29.4s, v13.4s, v7.s[3]\n"
                        "fmla v18.4s, v14.4s, v4.s[3]\n"
                        "fmla v22.4s, v14.4s, v5.s[3]\n"
                        "fmla v26.4s, v14.4s, v6.s[3]\n"
                        "fmla v30.4s, v14.4s, v7.s[3]\n"
                        "fmla v19.4s, v15.4s, v4.s[3]\n"
                        "fmla v23.4s, v15.4s, v5.s[3]\n"
                        "fmla v27.4s, v15.4s, v6.s[3]\n"
                        "fmla v31.4s, v15.4s, v7.s[3]\n"
                        "b 5f\n"
                        "4:\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
                        "fmla v24.4s, v8.4s, v2.s[0]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
                        "fmla v28.4s, v8.4s, v3.s[0]\n"
                        "ldr d8, [%[b_ptr0]]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "fmla v25.4s, v9.4s, v2.s[0]\n"
                        "ins v8.d[1], temploadreg0\n"
                        "fmla v29.4s, v9.4s, v3.s[0]\n"
                        "ldr d9, [%[b_ptr0], #0x10]\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "fmla v26.4s, v10.4s, v2.s[0]\n"
                        "ins v9.d[1], temploadreg1\n"
                        "fmla v30.4s, v10.4s, v3.s[0]\n"
                        "ldr d10, [%[b_ptr0], #0x20]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "fmla v27.4s, v11.4s, v2.s[0]\n"
                        "ins v10.d[1], temploadreg2\n"
                        "fmla v31.4s, v11.4s, v3.s[0]\n"
                        "ldr d11, [%[b_ptr0], #0x30]\n"
                        "fmla v16.4s, v12.4s, v0.s[1]\n"
                        "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
                        "fmla v20.4s, v12.4s, v1.s[1]\n"
                        "fmla v24.4s, v12.4s, v2.s[1]\n"
                        "ins v11.d[1], temploadreg3\n"
                        "fmla v28.4s, v12.4s, v3.s[1]\n"
                        "ldr d12, [%[b_ptr0], #0x40]\n"
                        "fmla v17.4s, v13.4s, v0.s[1]\n"
                        "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
                        "fmla v21.4s, v13.4s, v1.s[1]\n"
                        "fmla v25.4s, v13.4s, v2.s[1]\n"
                        "ins v12.d[1], temploadreg0\n"
                        "fmla v29.4s, v13.4s, v3.s[1]\n"
                        "ldr d13, [%[b_ptr0], #0x50]\n"
                        "fmla v18.4s, v14.4s, v0.s[1]\n"
                        "fmla v22.4s, v14.4s, v1.s[1]\n"
                        "fmla v26.4s, v14.4s, v2.s[1]\n"
                        "ins v13.d[1], temploadreg1\n"
                        "fmla v30.4s, v14.4s, v3.s[1]\n"
                        "ldr d14, [%[b_ptr0], #0x60]\n"
                        "fmla v19.4s, v15.4s, v0.s[1]\n"
                        "fmla v23.4s, v15.4s, v1.s[1]\n"
                        "fmla v27.4s, v15.4s, v2.s[1]\n"
                        "ins v14.d[1], temploadreg2\n"
                        "fmla v31.4s, v15.4s, v3.s[1]\n"
                        "ldr d15, [%[b_ptr0], #0x70]\n"
                        "fmla v16.4s, v8.4s, v0.s[2]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x80\n"
                        "fmla v20.4s, v8.4s, v1.s[2]\n"
                        "ins v15.d[1], temploadreg3\n"
                        "fmla v24.4s, v8.4s, v2.s[2]\n"
                        "fmla v28.4s, v8.4s, v3.s[2]\n"
                        "fmla v17.4s, v9.4s, v0.s[2]\n"
                        "fmla v21.4s, v9.4s, v1.s[2]\n"
                        "fmla v25.4s, v9.4s, v2.s[2]\n"
                        "fmla v29.4s, v9.4s, v3.s[2]\n"
                        "fmla v18.4s, v10.4s, v0.s[2]\n"
                        "fmla v22.4s, v10.4s, v1.s[2]\n"
                        "fmla v26.4s, v10.4s, v2.s[2]\n"
                        "fmla v30.4s, v10.4s, v3.s[2]\n"
                        "fmla v19.4s, v11.4s, v0.s[2]\n"
                        "fmla v23.4s, v11.4s, v1.s[2]\n"
                        "fmla v27.4s, v11.4s, v2.s[2]\n"
                        "fmla v31.4s, v11.4s, v3.s[2]\n"
                        "fmla v16.4s, v12.4s, v0.s[3]\n"
                        "fmla v20.4s, v12.4s, v1.s[3]\n"
                        "fmla v24.4s, v12.4s, v2.s[3]\n"
                        "fmla v28.4s, v12.4s, v3.s[3]\n"
                        "fmla v17.4s, v13.4s, v0.s[3]\n"
                        "fmla v21.4s, v13.4s, v1.s[3]\n"
                        "fmla v25.4s, v13.4s, v2.s[3]\n"
                        "fmla v29.4s, v13.4s, v3.s[3]\n"
                        "fmla v18.4s, v14.4s, v0.s[3]\n"
                        "fmla v22.4s, v14.4s, v1.s[3]\n"
                        "fmla v26.4s, v14.4s, v2.s[3]\n"
                        "fmla v30.4s, v14.4s, v3.s[3]\n"
                        "fmla v19.4s, v15.4s, v0.s[3]\n"
                        "fmla v23.4s, v15.4s, v1.s[3]\n"
                        "fmla v27.4s, v15.4s, v2.s[3]\n"
                        "fmla v31.4s, v15.4s, v3.s[3]\n"
                        "5:\n"
                        "cbz %[blocks], 6f\n"
                        "7:\n"
                        "ldr q8, [%[b_ptr0]]\n"
                        "subs %[blocks], %[blocks], #0x1\n"
                        "ldr q9, [%[b_ptr0], #0x10]\n"
                        "ldr s0, [%[a_ptr0]]\n"
                        "ldr q10, [%[b_ptr0], #0x20]\n"
                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
                        "ldr q11, [%[b_ptr0], #0x30]\n"
                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
                        "fmla v16.4s, v8.4s, v0.s[0]\n"
                        "ldr s1, [a_ptr1]\n"
                        "fmla v17.4s, v9.4s, v0.s[0]\n"
                        "add a_ptr1, a_ptr1, #0x4\n"
                        "fmla v18.4s, v10.4s, v0.s[0]\n"
                        "ldr s2, [a_ptr2]\n"
                        "fmla v20.4s, v8.4s, v1.s[0]\n"
                        "add a_ptr2, a_ptr2, #0x4\n"
                        "fmla v21.4s, v9.4s, v1.s[0]\n"
                        "ldr s3, [a_ptr3]\n"
                        "fmla v24.4s, v8.4s, v2.s[0]\n"
                        "add a_ptr3, a_ptr3, #0x4\n"
                        "fmla v25.4s, v9.4s, v2.s[0]\n"
                        "fmla v28.4s, v8.4s, v3.s[0]\n"
                        "fmla v29.4s, v9.4s, v3.s[0]\n"
                        "fmla v22.4s, v10.4s, v1.s[0]\n"
                        "fmla v26.4s, v10.4s, v2.s[0]\n"
                        "fmla v30.4s, v10.4s, v3.s[0]\n"
                        "fmla v19.4s, v11.4s, v0.s[0]\n"
                        "fmla v23.4s, v11.4s, v1.s[0]\n"
                        "fmla v27.4s, v11.4s, v2.s[0]\n"
                        "fmla v31.4s, v11.4s, v3.s[0]\n"
                        "b.ne 7b\n"
                        "6:\n"
                        "ld1r {v14.4s}, [%[minptr]]\n"
                        "ld1r {v15.4s}, [%[maxptr]]\n"
                        "fmax v16.4s, v16.4s, v14.4s\n"
                        "fmax v17.4s, v17.4s, v14.4s\n"
                        "fmax v18.4s, v18.4s, v14.4s\n"
                        "fmax v19.4s, v19.4s, v14.4s\n"
                        "fmin v16.4s, v16.4s, v15.4s\n"
                        "fmin v17.4s, v17.4s, v15.4s\n"
                        "fmin v18.4s, v18.4s, v15.4s\n"
                        "fmin v19.4s, v19.4s, v15.4s\n"
                        "str q16, [%[c_ptr0]]\n"
                        "fmax v20.4s, v20.4s, v14.4s\n"
                        "fmax v21.4s, v21.4s, v14.4s\n"
                        "fmax v22.4s, v22.4s, v14.4s\n"
                        "str q17, [%[c_ptr0], #0x10]\n"
                        "fmax v23.4s, v23.4s, v14.4s\n"
                        "fmin v20.4s, v20.4s, v15.4s\n"
                        "fmin v21.4s, v21.4s, v15.4s\n"
                        "str q18, [%[c_ptr0], #0x20]\n"
                        "fmin v22.4s, v22.4s, v15.4s\n"
                        "fmin v23.4s, v23.4s, v15.4s\n"
                        "fmax v24.4s, v24.4s, v14.4s\n"
                        "str q19, [%[c_ptr0], #0x30]\n"
                        "fmax v25.4s, v25.4s, v14.4s\n"
                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
                        "fmax v26.4s, v26.4s, v14.4s\n"
                        "str q20, [c_ptr1]\n"
                        "fmin v24.4s, v24.4s, v15.4s\n"
                        "fmin v25.4s, v25.4s, v15.4s\n"
                        "fmax v27.4s, v27.4s, v14.4s\n"
                        "str q21, [c_ptr1, #0x10]\n"
                        "fmin v26.4s, v26.4s, v15.4s\n"
                        "fmax v28.4s, v28.4s, v14.4s\n"
                        "fmax v29.4s, v29.4s, v14.4s\n"
                        "str q22, [c_ptr1, #0x20]\n"
                        "fmin v27.4s, v27.4s, v15.4s\n"
                        "fmax v30.4s, v30.4s, v14.4s\n"
                        "fmin v28.4s, v28.4s, v15.4s\n"
                        "str q23, [c_ptr1, #0x30]\n"
                        "fmin v29.4s, v29.4s, v15.4s\n"
                        "fmax v31.4s, v31.4s, v14.4s\n"
                        "fmin v30.4s, v30.4s, v15.4s\n"
                        "str q24, [c_ptr2]\n"
                        "fmin v31.4s, v31.4s, v15.4s\n"
                        "str q25, [c_ptr2, #0x10]\n"
                        "str q26, [c_ptr2, #0x20]\n"
                        "str q27, [c_ptr2, #0x30]\n"
                        "str q28, [c_ptr3]\n"
                        "str q29, [c_ptr3, #0x10]\n"
                        "str q30, [c_ptr3, #0x20]\n"
                        "str q31, [c_ptr3, #0x30]\n"
                        ".unreq a_ptr1\n"
                        ".unreq a_ptr2\n"
                        ".unreq a_ptr3\n"
                        ".unreq c_ptr1\n"
                        ".unreq c_ptr2\n"
                        ".unreq c_ptr3\n"
                        ".unreq temploadreg0\n"
                        ".unreq temploadreg1\n"
                        ".unreq temploadreg2\n"
                        ".unreq temploadreg3\n"
                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
                    );
                    break;
            }
            if (use_result_buffer) {
                for(int cy=0; cy<std::min(M-y, 4); cy++) {
                    for(unsigned int cx=0; cx<width; cx++) {
                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
                    }
                }
            }
        }
    }
}

} // namespace arm_gemm

#endif // __aarch64__
